Part 1

#Creating Bonus_Challange

mkdir

#Acquiring SRR accesions from the SRA run selector (input for fasterq-dump)

SRA

summary

Download data from the SRA repository and Perform a preliminary quality check of the FASTQ files.

#Creating a bash script for automizing download of fastq files, renaming them, and quality control with fastqc and multiqc. Output of fastq files will be moved to fastqc_results folder.

#Using nano for making and editing the bash script

make script

#The script

scrip1t

#Making the bash script executable and running it with nohup in the background. Cheking the process with cat nohup.out

scrip1t

#Result of the first script

scrip1t

#Moving the fastqc_results folder to computer with filezilla

scrip1t

QC with multiqc

#Per base quality scores of all samples seems normal meaning that there was no significant problem during sequencing

scrip1t

#Per sequence quality scores shows reads have high quality scores on average

scrip1t

#Per Sequence GC Content in mock_1 which is not a virus infected cell line shows abnormal GC content.

scrip1t

#Sequence Duplication Levels

scrip1t

#Over-represented sequences were present in both samples

scrip1t

#Over-represented sequences in Mock samples belonged to human genome mainly mitochondrial chormosome. This was checked by NCBI blast.

scrip1t

#Overrepresented sequences in RSV samples belonged to RSV virus infecting the cells.

scrip1t

#Overall the samples don’t require any trimming

#Mapping and calculating expressions using RSEM + quality control of mapping

#Copying The STAR Index folder from the previous analysis

scrip1t

#Using nano for making and editing the bash script

scrip1t

#The script

scrip1t

#Making the bash script executable and running it with nohup in the background. Cheking the process with cat nohup.out

scrip1t

#additional Script for postmapping quality check

scrip1t

#The mapping quality check will be moved to mapping_quality_results directory and we will download them with filezilla from the server.

#Post-Mapping Results:

scrip1t

scrip1t

#The higher percentage of unaligned sequences in RSV samples is to be expected

Part 2

Import the quantification tables obtained in the first part and produce a counts table.

#Setting up the working directory and loading packages

getwd()
## [1] "/home/behrad/Desktop/Series 8"
setwd("/home/behrad/Desktop/Series 8/")
library(edgeR)
## Loading required package: limma
library("vidger")
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
library(openxlsx)
library(ggvenn)
## Loading required package: dplyr
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
## Loading required package: grid
## Loading required package: ggplot2
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ lubridate 1.9.4     ✔ tibble    3.2.1
## ✔ purrr     1.0.4     ✔ tidyr     1.3.1
## ✔ readr     2.1.5
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

#Reading data

Mock_1 <- read.table("expression_results/Series8_A549_Mock_1.genes.results", header = TRUE, sep ="\t", row.names = 1)
Mock_2 <- read.table("expression_results/Series8_A549_Mock_2.genes.results", header = TRUE, sep ="\t", row.names = 1)
Mock_3 <- read.table("expression_results/Series8_A549_Mock_3.genes.results", header = TRUE, sep ="\t", row.names = 1)
RSV_1 <- read.table("expression_results/Series8_A549_RSV_1.genes.results", header = TRUE, sep ="\t", row.names = 1)
RSV_2 <- read.table("expression_results/Series8_A549_RSV_2.genes.results", header = TRUE, sep ="\t", row.names = 1)
RSV_3 <- read.table("expression_results/Series8_A549_RSV_3.genes.results", header = TRUE, sep ="\t", row.names = 1)

#Creating expected counts table from RSEM expected counts of every sample

Expected_Counts_Table <- cbind(Mock_1[,4], Mock_2[,4], Mock_3[,4], RSV_1[,4], RSV_2[,4], RSV_3[,4])

colnames(Expected_Counts_Table) <- c("Mock_1", "Mock_2", "Mock_3", "RSV_1", "RSV_2", "RSV_3")

rownames(Expected_Counts_Table) <- row.names(Mock_1)

head(Expected_Counts_Table) #let's see what it looks like
##          Mock_1 Mock_2 Mock_3 RSV_1 RSV_2 RSV_3
## A1BG      88.00  25.00  28.00 28.03 14.00 25.46
## A1BG-AS1  14.98   5.34  17.12 16.08  6.33 18.33
## A1CF       3.00   5.00  13.00  4.00  2.00  1.00
## A2M        1.00   1.00   2.00  4.00  2.00  2.00
## A2M-AS1    2.00  12.00  15.00 10.00  5.00  5.00
## A2ML1      0.00   0.00   0.00  0.00  0.00  0.00

Prepare your data for DGE analysis

#Creating edgeR DGEList object from expected counts table

DG <- DGEList(counts = Expected_Counts_Table, group = c(rep("Mock", 3), rep("RSV", 3)),
              samples = colnames(Expected_Counts_Table), remove.zeros = TRUE)
## Removing 8909 rows with all zero counts

#Having a look

head(Expected_Counts_Table)
##          Mock_1 Mock_2 Mock_3 RSV_1 RSV_2 RSV_3
## A1BG      88.00  25.00  28.00 28.03 14.00 25.46
## A1BG-AS1  14.98   5.34  17.12 16.08  6.33 18.33
## A1CF       3.00   5.00  13.00  4.00  2.00  1.00
## A2M        1.00   1.00   2.00  4.00  2.00  2.00
## A2M-AS1    2.00  12.00  15.00 10.00  5.00  5.00
## A2ML1      0.00   0.00   0.00  0.00  0.00  0.00
head(DG$counts)
##          Mock_1 Mock_2 Mock_3 RSV_1 RSV_2 RSV_3
## A1BG      88.00  25.00  28.00 28.03 14.00 25.46
## A1BG-AS1  14.98   5.34  17.12 16.08  6.33 18.33
## A1CF       3.00   5.00  13.00  4.00  2.00  1.00
## A2M        1.00   1.00   2.00  4.00  2.00  2.00
## A2M-AS1    2.00  12.00  15.00 10.00  5.00  5.00
## A3GALT2    0.00   0.00   0.00  0.00  0.00  1.00
head(cpm(DG))
##              Mock_1    Mock_2    Mock_3     RSV_1     RSV_2      RSV_3
## A1BG     18.4650273 4.8261747 3.1494986 2.2271203 2.1538769 2.24617399
## A1BG-AS1  3.1432512 1.0308709 1.9256934 1.2776345 0.9738601 1.61713941
## A1CF      0.6294896 0.9652349 1.4622672 0.3178195 0.3076967 0.08822364
## A2M       0.2098299 0.1930470 0.2249642 0.3178195 0.3076967 0.17644729
## A2M-AS1   0.4196597 2.3165639 1.6872314 0.7945488 0.7692418 0.44111822
## A3GALT2   0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.08822364
dim(Expected_Counts_Table)
## [1] 28250     6
dim(cpm(DG))
## [1] 19341     6

#Removing genes with very low expression (keeping those with at least 10 cpm in at least 2 samples)

DG_keep <- rowSums(cpm(DG) > 10) >= 2
DG <- DG[DG_keep, ,keep.lib.sizes = FALSE]
dim(cpm(DG))
## [1] 10185     6

#Calculate scaling factors to normalize library sizes - TMM NORMALIZATION

DG$samples$norm.factors  #Initial factors (all ones, we still need to normalize)
## [1] 1 1 1 1 1 1
head(cpm(DG))            #Non normalized CPM 
##              Mock_1    Mock_2    Mock_3     RSV_1     RSV_2     RSV_3
## A4GALT   24.0068086 14.884417 12.217498 11.661062 11.991300 13.485100
## AAAS    107.2870651 87.739720 86.093397 45.920457 49.055320 41.169743
## AACS     18.2706685 71.092675 67.481694 65.141103 65.562824 55.815813
## AADAC     4.6738919 12.730093 16.099694 22.437491 20.867977 22.772852
## AADACP1   1.0622482  5.092037  9.020396 10.696008  7.319365 10.359415
## AADAT     0.8497985 11.359160 11.532405  4.423161  6.073516  3.393601
DG <- calcNormFactors(DG)  #We compute the scaling factors for each library

DG$samples$norm.factors    #The scaling factors
## [1] 0.5235643 1.1989890 1.2078215 1.1078303 1.0743780 1.1081070
head(cpm(DG)) 
##             Mock_1    Mock_2    Mock_3     RSV_1     RSV_2     RSV_3
## A4GALT   45.852650 12.414140 10.115318 10.526036 11.161156 12.169493
## AAAS    204.916712 73.178086 71.279901 41.450804 45.659273 37.153220
## AACS     34.896707 59.293851 55.870586 58.800615 61.023981 50.370418
## AADAC     8.927065 10.617356 13.329531 20.253545 19.423310 20.551131
## AADACP1   2.028878  4.246942  7.468319  9.654916  6.812653  9.348750
## AADAT     1.623103  9.473949  9.548103  3.992634  5.653053  3.062521

Draw an MDS plot

#Multidimensional scaling plot of samples

plotMDS(DG, cex = 0.6, col = c(rep("blue", 3), rep("red", 3)))

DG <- estimateDisp(DG)
## Using classic mode.
head(DG$tagwise.dispersion)
## [1] 0.27829500 0.19452484 0.09477444 0.06399553 0.14462380 0.25665422
plotBCV(DG)

Perform the DGE analysis

#Perform test for differential expression between the two conditions.

Diff_expr_test <- exactTest(DG, c("Mock", "RSV"))

#Select genes with significant diff. expr. (FDR <= 0.01)

Diff_expr_test_sign <- topTags(Diff_expr_test, n = nrow(Diff_expr_test), p.value = 0.01)

#Having a look at the DEGs

head(Diff_expr_test_sign$table)
##          logFC   logCPM       PValue          FDR
## IFIT1 6.804468 7.905784 4.173923e-46 4.251141e-42
## MX1   6.494524 6.919100 3.444277e-45 1.753998e-41
## IFIT3 6.099768 7.291971 4.623207e-44 1.249133e-40
## IFIT2 7.340081 7.800481 4.905775e-44 1.249133e-40
## OASL  8.051167 7.188271 1.635817e-42 3.332159e-39
## IFI44 8.233885 4.173575 4.505796e-37 7.648589e-34
tail(Diff_expr_test_sign$table)
##            logFC   logCPM       PValue         FDR
## COQ2   -1.313791 4.324475 0.0006706830 0.009899864
## SPSB2  -2.503310 5.045460 0.0006725495 0.009913049
## CDC25C -1.510851 4.150944 0.0006743019 0.009924515
## FBF1   -2.014673 4.609588 0.0006769504 0.009945449
## LIFR    1.929748 4.001798 0.0006782455 0.009945449
## WDR62  -1.749110 6.146809 0.0006786536 0.009945449

#Number of upregulaated genes - that is genes that are more expressed in Infected w.r.t. Mock

sum(Diff_expr_test_sign$table$logFC > 0)
## [1] 434

Draw a few plots to illustrate your results

#Number of downregulated genes - that is genes that are less expressed in Infected w.r.t. Mock

sum(Diff_expr_test_sign$table$logFC < 0)
## [1] 261
UP_regulated_genes <- row.names(Diff_expr_test_sign$table)[Diff_expr_test_sign$table$logFC > 0]
DOWN_regulated_genes <- row.names(Diff_expr_test_sign$table)[Diff_expr_test_sign$table$logFC < 0]

boxplot(cpm(DG)[UP_regulated_genes,], notch = TRUE, outline = FALSE, col = c(rep("blue", 3),
                                                                             rep("red", 3)), ylab = "CPM", xlab = "Samples", main = "CPM Distr. UP Regulated genes in RSV infected A549 cells")

boxplot(cpm(DG)[DOWN_regulated_genes,], notch = TRUE, outline = FALSE, col = c(rep("blue", 3), rep("red", 3)), ylab = "CPM", xlab = "Samples", main = "CPM Distr. DOWN Regulated genes in RSV infected A549 cells")

colors <- ifelse(Diff_expr_test$table$PValue <= 0.01, ifelse(Diff_expr_test$table$logFC > 0, "red", "blue"), "black")

plot(Diff_expr_test$table$logFC, -log10(Diff_expr_test$table$PValue), cex = 0.1, ylim = c(0,50), ylab = "-log10 Pvalue", xlab = "LogFC", main = "Vulcano Plot", col = colors)

#Scatterplot

vsScatterPlot(x = "Mock", y = "RSV", data = DG, type = "edger")

#Volcano plot i.e. a scatterplot of logFC vs FDR (adh pv)

vsVolcano(x = "Mock", y = "RSV", data = DG, type = "edger", padj = 0.01)
## Warning in eval(quote(list(...)), env): NAs introduced by coercion

#MA plot i.e. a scatter plot of expression vs logFC

vsMAPlot(x = "Mock", y = "RSV", data = DG, type = "edger", padj = 0.01)
## Warning in eval(quote(list(...)), env): NAs introduced by coercion

Saving the tables

#Saving all the results into an excel file

wb <- createWorkbook()

addWorksheet(wb, "Expected Counts")
addWorksheet(wb, "CPM")
addWorksheet(wb, "Diff.Expr.All.Genes")
addWorksheet(wb, "Diff.Expr.SIGN")

writeData(wb = wb, sheet = "Expected Counts", x = Expected_Counts_Table, rowNames = TRUE)
writeData(wb = wb, sheet = "CPM", x = cpm(DG), rowNames = TRUE)
writeData(wb = wb, sheet = "Diff.Expr.All.Genes", x = Diff_expr_test$table, rowNames = TRUE)
writeData(wb = wb, sheet = "Diff.Expr.SIGN", x = Diff_expr_test_sign$table, rowNames = TRUE)

saveWorkbook(wb, "Diff_expr_analysis_series_8.xlsx", overwrite = TRUE)

Drawing the Venn Diagram

#Loading differentailly expressed genes from Series 6 and Series 8

Series_6 <- read.xlsx("Diff_expr_analysis_series_6.xlsx", sheet = "Diff.Expr.SIGN", rowNames = T)
Series_8 <- read.xlsx("Diff_expr_analysis_series_8.xlsx", sheet = "Diff.Expr.SIGN", rowNames = T)

#Seperating Up and Down regulated genes from both series

Up_reg_S6 <- Series_6 %>%
  filter(logFC > 0) %>%
  row.names()

Up_reg_S8 <- Series_8 %>%
  filter(logFC > 0)%>%
  row.names()

Down_reg_S6 <- Series_6 %>%
  filter(logFC < 0)%>%
  row.names()

Down_reg_S8<- Series_8 %>%
  filter(logFC < 0)%>%
  row.names()

#Making the list for the Venn Diagram and Drawing the plot for up-regulated genes

venn_list <- list("Series 6" = Up_reg_S6, "Series 8" = Up_reg_S8)

ggvenn(data = venn_list,
       fill_color = c("skyblue", "lightpink"))

#Making the list for the Venn Diagram and Drawing the plot for Down-regulated genes

venn_list_Down <- list("Series 6" = Down_reg_S6, "Series 8" = Down_reg_S8)

ggvenn(data = venn_list_Down,
       fill_color = c("skyblue", "lightpink"))

# The End